import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns
from feature_engine.selection import SmartCorrelatedSelection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
import polars as pl
# Path needs to be added manually to read from another folder.
# BUG FIX: the original used os.path.dirname("__file__") -- that passes the
# literal string "__file__" (the __file__ variable is undefined in notebooks),
# which always returns "" and silently resolved relative to the CWD.
# Use os.getcwd() to make that intent explicit; the resolved path is identical.
path2add = os.path.normpath(
    os.path.abspath(os.path.join(os.getcwd(), os.path.pardir, "utils"))
)
if path2add not in sys.path:
    sys.path.append(path2add)
from feature_engineering import aggregate_node_features, get_graph_features
# IPython magics: auto-reload local modules (e.g. feature_engineering.py) on
# every change while iterating; then enable offline plotly rendering so
# figures display inside the notebook without an external service.
%load_ext autoreload
%autoreload 2
plotly.offline.init_notebook_mode()
Data¶
We will import the supervised_call_graphs.json file to see if there are any features that can be engineered, because the clean_data_supervised.parquet file already contains descriptive features. This json file contains graph data, so before this data can be analyzed, the data must be processed.
# Load the cleaned supervised dataset (tabular features + label) and the raw
# call-graph JSON (one row per graph `_id`, edges as a list of 2-field structs).
apidf = pl.read_parquet('../data/clean_data_supervised.parquet')
callsdf = pl.read_json('../data/supervised_call_graphs.json')
callsdf.head()
| _id | call_graph |
|---|---|
| str | list[struct[2]] |
| "1f2c32d8-2d6e-3b68-bc46-789469… | [{"1f873432-6944-3df9-8300-8a3cf9f95b35","5862055b-35a6-316a-8e20-3ae20c1763c2"}, {"8955faa9-0e33-37ad-a1dc-f0e640a114c2","a4fd6415-1fd4-303e-aa33-bb1830b5d9d4"}, … {"016099ea-6f20-3fec-94cf-f7afa239f398","6fa8ad53-2f0d-3f44-8863-139092bfeda9"}] |
| "4c486414-d4f5-33f6-b485-24a8ed… | [{"016099ea-6f20-3fec-94cf-f7afa239f398","946e3ced-48a5-3de5-ad5a-1d20b1ab7eb5"}, {"a05a261f-128d-3cd8-a8e1-d6e52e161947","375c16ea-5f8d-32d5-8893-639d9b3a53d6"}, … {"68acdde8-bd53-39d1-9be0-fd67a281d7be","d7a53acc-eb6e-3f6c-b72e-9aefb54dd311"}] |
| "7e5838fc-bce1-371f-a3ac-d8a0b2… | [{"1f873432-6944-3df9-8300-8a3cf9f95b35","5862055b-35a6-316a-8e20-3ae20c1763c2"}, {"857c4b20-3057-30e0-9ca3-d6f5c3dbe4a6","857c4b20-3057-30e0-9ca3-d6f5c3dbe4a6"}, … {"016099ea-6f20-3fec-94cf-f7afa239f398","6fa8ad53-2f0d-3f44-8863-139092bfeda9"}] |
| "82661ecd-d87f-3dff-855e-378f7c… | [{"47896677-7c81-381f-8d03-3b2c94a27fdc","8244a4e7-5f5e-384e-b6a1-d6f065cecb11"}, {"089d44f6-bdf6-3a42-886a-db8e427fd2e0","756ab2fe-a386-32dd-9a4e-18785c38a414"}, … {"22d3028b-b12a-34d7-b641-886ab54ae6ff","22d3028b-b12a-34d7-b641-886ab54ae6ff"}] |
| "d62d56ea-775e-328c-8b08-db7ad7… | [{"876b4958-7df1-3b2b-9def-1a22f1d444e3","aadf8ca9-ffda-30f0-bacf-2203e80c0811"}, {"cb8ef584-d1ad-3d44-a328-792f6556c23f","be305dea-fec3-3b4d-92b4-6cb88038c4cc"}, … {"d7d9c8e7-fbe2-3195-a903-20ab61c63de7","0fd98078-d00b-36c2-b067-79baa4e93068"}] |
Pre-process graph data¶
# Each call_graph element is a 2-field struct; give the fields meaningful
# names, then flatten the list so every edge becomes its own (from, to) row.
renamed = callsdf.with_columns(
    pl.col("call_graph").list.eval(
        pl.element().struct.rename_fields(["from", "to"])
    )
)
calls_processed = renamed.explode("call_graph").unnest("call_graph")
calls_processed.head()
| _id | from | to |
|---|---|---|
| str | str | str |
| "1f2c32d8-2d6e-3b68-bc46-789469… | "1f873432-6944-3df9-8300-8a3cf9… | "5862055b-35a6-316a-8e20-3ae20c… |
| "1f2c32d8-2d6e-3b68-bc46-789469… | "8955faa9-0e33-37ad-a1dc-f0e640… | "a4fd6415-1fd4-303e-aa33-bb1830… |
| "1f2c32d8-2d6e-3b68-bc46-789469… | "85754db8-6a55-30b7-8558-dec75f… | "85754db8-6a55-30b7-8558-dec75f… |
| "1f2c32d8-2d6e-3b68-bc46-789469… | "9f08fee1-953c-3801-b254-c0256f… | "876b4958-7df1-3b2b-9def-1a22f1… |
| "1f2c32d8-2d6e-3b68-bc46-789469… | "857c4b20-3057-30e0-9ca3-d6f5c3… | "857c4b20-3057-30e0-9ca3-d6f5c3… |
Feature Engineering¶
We observe that each graph has a separate _id that can be later used to join to the main dataset. A graph consists of source and destination nodes which refer to the available API calls.
Basic Graph Level Features¶
The most basic graph-level features that we can engineer are:
- Number of edges (connections)
- Number of nodes (APIs)
These features could be useful since most behaviors are going to have a "normal" range of APIs that they contact. If this number is too large or too small, this might be an indication of anomalous activity.
# Graph-level size features: edge count (n_connections) and distinct node
# count (n_unique_nodes) per graph `_id`.
per_graph = calls_processed.group_by('_id').agg(
    pl.len().alias('n_connections'),
    pl.col('from'),
    pl.col('to'),
)
graph_features = (
    per_graph
    .with_columns(
        # Union the source and destination node lists, dedupe, and count.
        pl.concat_list('from', 'to')
        .list.unique()
        .list.len()
        .alias('n_unique_nodes')
    )
    .select(['_id', 'n_connections', 'n_unique_nodes'])
)
graph_features.sample(3)
| _id | n_connections | n_unique_nodes |
|---|---|---|
| str | u32 | u32 |
| "0cdc6111-dc79-32d5-a221-5ba3df… | 28 | 20 |
| "cf3ebfff-a72b-3958-b863-e85de3… | 15 | 10 |
| "2e18a413-5ebf-3c4e-9dc5-47cfed… | 242 | 69 |
Node Level Features¶
Since graphs consist of nodes, we can engineer a set of features around specific nodes (APIs). We can calculate:
- Node degrees - the number of edges that come from/into a node. Very highly connected nodes can look anomalous
- Node centrality - there are various centrality measures (e.g. PageRank), but they all try to estimate how important a specific node is to the whole graph. This feature could be useful because a behavior pattern that doesn't touch any of the "central" APIs would look anomalous
These features can be broken down into:
- global features - measure node attributes across all the graphs
- local features - measure node attributes across a specific graph
# Node degree features: how connected each API is, both globally (counted
# across all graphs) and locally (counted within a single graph `_id`).
calls_processed = calls_processed.with_columns(
    pl.len().over('from').alias('global_source_degrees'),
    pl.len().over('to').alias('global_dest_degrees'),
    pl.len().over('from', '_id').alias('local_source_degrees'),
    pl.len().over('to', '_id').alias('local_dest_degrees'),
)
calls_processed.sample(3)
| _id | from | to | global_source_degrees | global_dest_degrees | local_source_degrees | local_dest_degrees |
|---|---|---|---|---|---|---|
| str | str | str | u32 | u32 | u32 | u32 |
| "8a989644-d121-315b-8206-deae98… | "d68e78ab-d01a-35b5-b816-7d715c… | "1d768e1f-ee4c-3486-9263-432754… | 375 | 1035 | 7 | 8 |
| "3bcc9249-8bf7-3622-b0ca-1c6b26… | "756ab2fe-a386-32dd-9a4e-18785c… | "699caece-830f-3194-ae24-7b4563… | 6808 | 711 | 24 | 6 |
| "74931360-c3a3-353a-8dfc-72030c… | "dab3f781-7cce-3286-a667-c9f295… | "756ab2fe-a386-32dd-9a4e-18785c… | 2406 | 22416 | 2 | 31 |
Now that the node-level features are calculated, we need to aggregate them for a specific graph (_id). When aggregating, we can calculate average, std, min, and max statistics for every feature to capture the distribution well.
# Aggregate the per-edge degree features up to graph level via the shared
# helper (produces avg/min/max/std columns per feature, per `_id` -- see the
# output schema below), then join onto the graph-level features.
node_features_agg = aggregate_node_features(
    calls_processed,
    node_features=[
        "global_source_degrees",
        "global_dest_degrees",
        "local_source_degrees",
        "local_dest_degrees",
    ],
    by="_id",
)
graph_features = graph_features.join(node_features_agg, on="_id")
graph_features.head()
| _id | n_connections | n_unique_nodes | avg_global_source_degrees | min_global_source_degrees | max_global_source_degrees | std_global_source_degrees | avg_global_dest_degrees | min_global_dest_degrees | max_global_dest_degrees | std_global_dest_degrees | avg_local_source_degrees | min_local_source_degrees | max_local_source_degrees | std_local_source_degrees | avg_local_dest_degrees | min_local_dest_degrees | max_local_dest_degrees | std_local_dest_degrees |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| str | u32 | u32 | f64 | u32 | u32 | f64 | f64 | u32 | u32 | f64 | f64 | u32 | u32 | f64 | f64 | u32 | u32 | f64 |
| "be3aae78-b140-37f9-af2d-da867f… | 170 | 37 | 8085.788235 | 333 | 32071 | 8264.750607 | 9182.858824 | 474 | 22416 | 7620.343692 | 8.317647 | 1 | 16 | 4.557877 | 8.152941 | 1 | 18 | 4.706802 |
| "66561100-a361-3636-a81b-342d55… | 53 | 30 | 223.566038 | 6 | 596 | 175.431612 | 479.584906 | 5 | 1151 | 408.467089 | 2.54717 | 1 | 5 | 1.380719 | 5.113208 | 1 | 10 | 3.256121 |
| "73e1fe84-da14-309e-8783-0da801… | 104 | 48 | 8211.971154 | 111 | 32071 | 9959.341617 | 10563.759615 | 80 | 22416 | 9094.504387 | 4.673077 | 1 | 14 | 4.475266 | 6.846154 | 1 | 16 | 6.07201 |
| "b49d6f27-2c83-3fa3-9646-5a3573… | 75 | 42 | 218.2 | 2 | 596 | 201.588623 | 346.706667 | 7 | 1151 | 368.708036 | 3.08 | 1 | 8 | 2.258677 | 4.413333 | 1 | 11 | 3.556848 |
| "42752144-40ff-3abc-b88c-adf61f… | 3 | 3 | 24848.666667 | 10404 | 32071 | 12509.448283 | 14065.0 | 1217 | 20489 | 11126.694388 | 1.666667 | 1 | 2 | 0.57735 | 1.666667 | 1 | 2 | 0.57735 |
Feature Selection¶
Feature selection will be done using 2 steps:
- Quality checks - if the feature is constant or has too many missing values (>= 95%) it will be dropped
- Correlation analysis - if features have very high correlation (>= 95%) with each other, they can be dropped as well
# All engineered feature columns -- everything except the leading `_id` key.
engineered_features = graph_features.columns[1:]
engineered_features
['n_connections', 'n_unique_nodes', 'avg_global_source_degrees', 'min_global_source_degrees', 'max_global_source_degrees', 'std_global_source_degrees', 'avg_global_dest_degrees', 'min_global_dest_degrees', 'max_global_dest_degrees', 'std_global_dest_degrees', 'avg_local_source_degrees', 'min_local_source_degrees', 'max_local_source_degrees', 'std_local_source_degrees', 'avg_local_dest_degrees', 'min_local_dest_degrees', 'max_local_dest_degrees', 'std_local_dest_degrees']
Quality checks¶
# Count nulls per column and show only columns that have any. Only the std_*
# features contain nulls -- presumably from groups with a single row where a
# standard deviation is undefined; confirm against aggregate_node_features.
null_counts = graph_features.null_count().transpose(include_header=True, header_name='col', column_names=['null_count'])
null_counts.filter(pl.col('null_count') > 0)
| col | null_count |
|---|---|
| str | u32 |
| "std_global_source_degrees" | 42 |
| "std_global_dest_degrees" | 42 |
| "std_local_source_degrees" | 42 |
| "std_local_dest_degrees" | 42 |
# Flag constant features (zero standard deviation); the empty result below
# shows none of the engineered features are constant.
static_features = graph_features.select(engineered_features).std().transpose(include_header=True, header_name='col', column_names=['std'])
static_features.filter(pl.col('std') == 0)
| col | std |
|---|---|
| str | f64 |
Correlation Analysis¶
# Pairwise Pearson correlations between the engineered features (rows with
# nulls dropped first, since corr cannot handle them).
feature_corrs = graph_features.select(engineered_features).to_pandas().dropna().corr()
feature_corrs.index = feature_corrs.columns
# Mask the upper triangle so each correlation is drawn only once.
matrix = np.triu(feature_corrs)
fig = plt.figure(figsize=(20, 10))
sns.heatmap(feature_corrs, annot=True, mask=matrix)
<Axes: >
There are groups of highly correlated features. Applying SmartCorrelatedSelection should reduce the feature set of engineered features.
# From each group of features correlated above 0.95 (Pearson), keep only the
# highest-variance member and mark the rest for dropping.
features_pd = graph_features.select(engineered_features).to_pandas().dropna()
tr = SmartCorrelatedSelection(
    variables=None,            # consider all numeric columns
    method="pearson",
    threshold=0.95,            # correlation cutoff for grouping
    missing_values="raise",    # we already dropped nulls above
    selection_method="variance",
    estimator=None,            # not needed for variance-based selection
)
tr.fit(features_pd)
print('Features to drop:')
for f in tr.features_to_drop_:
    print(f)
Features to drop: std_global_dest_degrees n_unique_nodes max_local_source_degrees max_local_dest_degrees std_local_dest_degrees avg_local_dest_degrees avg_local_source_degrees
Observations¶
- The engineered features have groups of high correlation
Impact¶
- std_global_dest_degrees, n_unique_nodes, max_local_source_degrees, max_local_dest_degrees, std_local_dest_degrees, avg_local_dest_degrees, avg_local_source_degrees will be dropped due to having high correlations and low variances compared with the other features
EDA for Remaining Engineered Features¶
# Keep the features that survived the correlated-selection step.
# FIX: the original built this via list(set(...).difference(...)), whose
# ordering is non-deterministic across Python processes; iterating the
# DataFrame columns gives a stable, reproducible order with the same members.
_dropped = set(tr.features_to_drop_)
remaining_engineered_features = [c for c in features_pd.columns if c not in _dropped]
# Attach the label so the surviving features can be analyzed against it.
graph_features_merged = graph_features.join(apidf.select(['_id', 'is_anomaly']), on='_id')
remaining_engineered_features
['std_local_source_degrees', 'min_local_source_degrees', 'avg_global_source_degrees', 'max_global_dest_degrees', 'max_global_source_degrees', 'std_global_source_degrees', 'min_local_dest_degrees', 'min_global_source_degrees', 'avg_global_dest_degrees', 'n_connections', 'min_global_dest_degrees']
graph_features_merged.head()
| _id | n_connections | n_unique_nodes | avg_global_source_degrees | min_global_source_degrees | max_global_source_degrees | std_global_source_degrees | avg_global_dest_degrees | min_global_dest_degrees | max_global_dest_degrees | std_global_dest_degrees | avg_local_source_degrees | min_local_source_degrees | max_local_source_degrees | std_local_source_degrees | avg_local_dest_degrees | min_local_dest_degrees | max_local_dest_degrees | std_local_dest_degrees | is_anomaly |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| str | u32 | u32 | f64 | u32 | u32 | f64 | f64 | u32 | u32 | f64 | f64 | u32 | u32 | f64 | f64 | u32 | u32 | f64 | bool |
| "1f2c32d8-2d6e-3b68-bc46-789469… | 2821 | 447 | 4055.665012 | 3 | 32071 | 6840.719715 | 4547.629918 | 2 | 22416 | 6567.762529 | 25.519674 | 1 | 126 | 30.889073 | 30.768167 | 1 | 164 | 40.924937 | false |
| "4c486414-d4f5-33f6-b485-24a8ed… | 1270 | 280 | 5174.526772 | 3 | 32071 | 7527.970754 | 5858.023622 | 12 | 22416 | 7074.265666 | 16.270866 | 1 | 76 | 19.458242 | 18.710236 | 1 | 65 | 20.34814 | false |
| "7e5838fc-bce1-371f-a3ac-d8a0b2… | 1589 | 354 | 4174.369415 | 3 | 32071 | 7048.385421 | 4814.517306 | 2 | 22416 | 6676.398634 | 14.921963 | 1 | 79 | 17.700993 | 19.623033 | 1 | 93 | 24.258697 | false |
| "82661ecd-d87f-3dff-855e-378f7c… | 459 | 114 | 5867.786492 | 12 | 32071 | 7153.580321 | 6689.276688 | 10 | 22416 | 6900.948899 | 10.063181 | 1 | 27 | 7.321257 | 10.755991 | 1 | 28 | 8.03984 | false |
| "d62d56ea-775e-328c-8b08-db7ad7… | 89 | 23 | 6914.842697 | 53 | 32071 | 10320.004581 | 5613.41573 | 38 | 22416 | 7792.410521 | 5.921348 | 1 | 12 | 3.163085 | 5.157303 | 1 | 9 | 2.250752 | false |
# Narrow to the id, the surviving engineered features, and the label.
remaining_features_df = graph_features_merged.select(['_id'] + remaining_engineered_features + ['is_anomaly'])
remaining_features_df.head()
| _id | std_local_source_degrees | min_local_source_degrees | avg_global_source_degrees | max_global_dest_degrees | max_global_source_degrees | std_global_source_degrees | min_local_dest_degrees | min_global_source_degrees | avg_global_dest_degrees | n_connections | min_global_dest_degrees | is_anomaly |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| str | f64 | u32 | f64 | u32 | u32 | f64 | u32 | u32 | f64 | u32 | u32 | bool |
| "1f2c32d8-2d6e-3b68-bc46-789469… | 30.889073 | 1 | 4055.665012 | 22416 | 32071 | 6840.719715 | 1 | 3 | 4547.629918 | 2821 | 2 | false |
| "4c486414-d4f5-33f6-b485-24a8ed… | 19.458242 | 1 | 5174.526772 | 22416 | 32071 | 7527.970754 | 1 | 3 | 5858.023622 | 1270 | 12 | false |
| "7e5838fc-bce1-371f-a3ac-d8a0b2… | 17.700993 | 1 | 4174.369415 | 22416 | 32071 | 7048.385421 | 1 | 3 | 4814.517306 | 1589 | 2 | false |
| "82661ecd-d87f-3dff-855e-378f7c… | 7.321257 | 1 | 5867.786492 | 22416 | 32071 | 7153.580321 | 1 | 12 | 6689.276688 | 459 | 10 | false |
| "d62d56ea-775e-328c-8b08-db7ad7… | 3.163085 | 1 | 6914.842697 | 22416 | 32071 | 10320.004581 | 1 | 53 | 5613.41573 | 89 | 38 | false |
remaining_features_df.columns[1:-1]
['std_local_source_degrees', 'min_local_source_degrees', 'avg_global_source_degrees', 'max_global_dest_degrees', 'max_global_source_degrees', 'std_global_source_degrees', 'min_local_dest_degrees', 'min_global_source_degrees', 'avg_global_dest_degrees', 'n_connections', 'min_global_dest_degrees']
# Spot-check: Pearson correlation of one surviving feature with the boolean label.
remaining_features_df.select(pl.corr('min_local_dest_degrees', 'is_anomaly')).item()
0.1871244054310182
# Correlation of every surviving feature (all columns except `_id` and the
# label) with is_anomaly, printed one per line.
for feature in remaining_features_df.columns[1:-1]:
    value = remaining_features_df.select(pl.corr(feature, "is_anomaly")).item()
    print(f'{feature}: {value}')
std_local_source_degrees: -0.4599070276100464 min_local_source_degrees: 0.17967814928163506 avg_global_source_degrees: 0.019838141497681823 max_global_dest_degrees: -0.21899813519545483 max_global_source_degrees: -0.18965286839023868 std_global_source_degrees: -0.024380534174284717 min_local_dest_degrees: 0.1871244054310182 min_global_source_degrees: 0.19599078710208603 avg_global_dest_degrees: -0.13069273069548185 n_connections: -0.3707623919264986 min_global_dest_degrees: 0.3039360455504536
Feature Importances¶
A Random Forest will be trained on the engineered features to determine feature importances. Because several of these features have high cardinality and impurity-based importances are biased towards this, I will also determine permutation importances.
# Build the design matrix (all columns except `_id` and the label) and target,
# split with stratification to preserve the anomaly ratio, and fit a Random
# Forest; min_samples_leaf=20 regularizes against overfitting on tiny leaves.
X = remaining_features_df.select(remaining_features_df.columns[1:-1]).to_numpy()
y = remaining_features_df.select('is_anomaly').to_numpy().ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state = 0)
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=20, random_state=0)
rf.fit(X_train, y_train)
RandomForestClassifier(min_samples_leaf=20, random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(min_samples_leaf=20, random_state=0)
# Impurity-based (MDI) feature importances from the fitted forest, sorted
# from most to least important.
feature_names = remaining_features_df.columns[1:-1]
importances = rf.feature_importances_
feature_importance_df = pl.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort(by='Importance', descending=True)
feature_importance_df
| Feature | Importance |
|---|---|
| str | f64 |
| "n_connections" | 0.292084 |
| "std_local_source_degrees" | 0.267337 |
| "std_global_source_degrees" | 0.110376 |
| "max_global_dest_degrees" | 0.077428 |
| "avg_global_source_degrees" | 0.065096 |
| … | … |
| "min_global_source_degrees" | 0.049449 |
| "avg_global_dest_degrees" | 0.046397 |
| "max_global_source_degrees" | 0.037231 |
| "min_local_source_degrees" | 0.000789 |
| "min_local_dest_degrees" | 0.0 |
rf_imp_fig = px.bar(feature_importance_df, x='Feature', y='Importance')
rf_imp_fig.show()
r = permutation_importance(rf, X_test, y_test, n_repeats=50, random_state=0)
perm_importances = r.importances_mean
perm_importance_df = pl.DataFrame({'Feature': feature_names, 'Importance': perm_importances})
perm_importance_df = perm_importance_df.sort(by='Importance', descending=True)
perm_importance_df
| Feature | Importance |
|---|---|
| str | f64 |
| "n_connections" | 0.202124 |
| "std_local_source_degrees" | 0.1541 |
| "std_global_source_degrees" | 0.056578 |
| "avg_global_source_degrees" | 0.035044 |
| "max_global_dest_degrees" | 0.029499 |
| … | … |
| "min_global_dest_degrees" | 0.01351 |
| "avg_global_dest_degrees" | 0.012802 |
| "min_local_source_degrees" | 0.003127 |
| "max_global_source_degrees" | 0.00059 |
| "min_local_dest_degrees" | 0.000059 |
# Bar chart of the mean permutation importances computed above.
perm_fig = px.bar(perm_importance_df, x='Feature', y='Importance')
perm_fig.show()
Observations¶
- min_local_source_degrees and min_local_dest_degrees have the lowest importance scores in the impurity-based feature importance, and are two of the bottom three in the permutation importance score
- most of the predictive features are global
Impact¶
- min_local_source_degrees and min_local_dest_degrees should be dropped
- a tree-based model should be used to model the relationships between the engineered features
# Drop the two features with the lowest importance scores identified above.
_low_importance = {'min_local_dest_degrees', 'min_local_source_degrees'}
remaining_engineered_features = [
    f for f in remaining_engineered_features if f not in _low_importance
]
print('Final engineered featureset:')
print(remaining_engineered_features)
Final engineered featureset: ['std_local_source_degrees', 'avg_global_source_degrees', 'max_global_dest_degrees', 'max_global_source_degrees', 'std_global_source_degrees', 'min_global_source_degrees', 'avg_global_dest_degrees', 'n_connections', 'min_global_dest_degrees']
Feature Engineering Pipeline¶
# Final feature set chosen by the selection + importance analysis above.
selected_features = ['std_local_source_degrees',
                     'avg_global_source_degrees',
                     'max_global_dest_degrees',
                     'max_global_source_degrees',
                     'std_global_source_degrees',
                     'min_global_source_degrees',
                     'avg_global_dest_degrees',
                     'n_connections',
                     'min_global_dest_degrees']
# End-to-end pipeline: load graphs, flatten edges, compute node degrees,
# aggregate to graph level, and keep only the selected features.
callsdf = (
    (
        pl.read_json("../data/supervised_call_graphs.json")
        .with_columns(
            # Name the struct fields, then one row per (from, to) edge.
            pl.col("call_graph").list.eval(
                pl.element().struct.rename_fields(["from", "to"])
            )
        )
        .explode("call_graph")
        .unnest("call_graph")
    )
    .with_columns(
        # Global degrees count over all graphs; local degrees within one `_id`.
        global_source_degrees=pl.len().over(pl.col("from")),
        global_dest_degrees=pl.len().over(pl.col("to")),
        local_source_degrees=pl.len().over(pl.col("from"), pl.col("_id")),
        local_dest_degrees=pl.len().over(pl.col("to"), pl.col("_id")),
    )
    # NOTE(review): assumes get_graph_features reproduces both the graph-level
    # counts and the aggregate_node_features statistics used in the analysis
    # above -- confirm against utils/feature_engineering.py.
    .pipe(get_graph_features)
    .select(["_id"] + selected_features)
)
# Join the engineered graph features onto the clean dataset and persist.
pl.read_parquet("../data/clean_data_supervised.parquet").join(
    callsdf, on="_id"
).write_parquet("../data/features_clean_data_supervised.parquet")
Summary¶
Feature Engineering Summary¶
- 18 new features were engineered, which measured graph and node related features
- Graph-level features measure the total size of the graphs
- Node level features measure the degrees on global and local levels
- 7 features were dropped due to high correlation within group
- 2 more features were dropped due to low feature importance scores
Implications for ML¶
- The 9 engineered and selected features could be useful in the prediction task, so they should be included in the final model
- Feature engineering pipeline was designed, so new data can be easily transformed